In [1]:
def search(pattern, text):
"""Return true if pattern appears anywhere in text."""
if pattern[0] == "^":
return match(pattern[1:], text)
else:
return match(".*"+pattern, text)
In [2]:
def match1(p, text):
"""Return true if first character of text matches pattern character p."""
if not text: return False
return p == '.' or p == text[0]
In [6]:
def match_star(p, pattern, text):
"""Return true if any number of char p, followed by pattern, matching text."""
return (match1(p, text) and match_star(p, pattern, text[1:])) or match(pattern, text)
In [5]:
def match(pattern, text):
"""Return True if pattern appears at the start of text."""
if pattern == "":
return True
elif pattern == "$":
return (text == "")
elif len(pattern) > 1 and pattern[1] in "*?":
p, op, pat = pattern[0], pattern[1], pattern[2:]
if op == "*":
return match_star(p, pat, text)
elif op == "?":
if match1(p, text) and match(pat, text[1:]):
return True
else:
return match(pat, text)
else:
return match1(pattern[0], text) and match(pattern[1:], text[1:])
In [10]:
def test():
assert search('baa*!', 'Sheep said baaaa!')
assert search('baa*!', 'Sheep said baaaa humbug') == False
assert match('baa*!', 'Sheep said baaaa!') == False
print "All test pass!"
test()
In [ ]:
def lit(string): return ('lit', string)
def seq(x, y): return ('seq', x, y)
def alt(x, y): return ('alt', x, y)
def star(x): return ('star', x)
def plus(x): return ('seq', x, star(x))
def opt(x): return alt(lit(''), x) #opt(x) means that x is optional
def oneof(chars): return ('oneof', tuple(chars))
dot = ('dot',)
eol = ('eol',)
def test():
assert lit('abc') == ('lit', 'abc')
assert seq(('lit', 'a'),
('lit', 'b')) == ('seq', ('lit', 'a'), ('lit', 'b'))
assert alt(('lit', 'a'),
('lit', 'b')) == ('alt', ('lit', 'a'), ('lit', 'b'))
assert star(('lit', 'a')) == ('star', ('lit', 'a'))
assert plus(('lit', 'c')) == ('seq', ('lit', 'c'),
('star', ('lit', 'c')))
assert opt(('lit', 'x')) == ('alt', ('lit', ''), ('lit', 'x'))
assert oneof('abc') == ('oneof', ('a', 'b', 'c'))
return 'tests pass'
print test()
In [ ]:
def matchset(pattern, text):
"Match pattern at start of text; return a set of remainders of text."
op, x, y = components(pattern)
if 'lit' == op:
return set([text[len(x):]]) if text.startswith(x) else null
elif 'seq' == op:
return set(t2 for t1 in matchset(x, text) for t2 in matchset(y, t1))
elif 'alt' == op:
return matchset(x, text) | matchset(y, text)
elif 'dot' == op:
return set([text[1:]]) if text else null
elif 'oneof' == op:
return set([text[1:]]) if text.startswith(x) else null
elif 'eol' == op:
return set(['']) if text == '' else null
elif 'star' == op:
return (set([text]) |
set(t2 for t1 in matchset(x, text)
for t2 in matchset(pattern, t1) if t1 != text))
else:
raise ValueError('unknown pattern: %s' % pattern)
null = frozenset()
def components(pattern):
"Return the op, x, and y arguments; x and y are None if missing."
x = pattern[1] if len(pattern) > 1 else None
y = pattern[2] if len(pattern) > 2 else None
return pattern[0], x, y
def test():
assert matchset(('lit', 'abc'), 'abcdef') == set(['def'])
assert matchset(('seq', ('lit', 'hi '),
('lit', 'there ')),
'hi there nice to meet you') == set(['nice to meet you'])
assert matchset(('alt', ('lit', 'dog'),
('lit', 'cat')), 'dog and cat') == set([' and cat'])
assert matchset(('dot',), 'am i missing something?') == set(['m i missing something?'])
assert matchset(('oneof', 'a'), 'aabc123') == set(['abc123'])
assert matchset(('eol',),'') == set([''])
assert matchset(('eol',),'not end of line') == frozenset([])
assert matchset(('star', ('lit', 'hey')), 'heyhey!') == set(['!', 'heyhey!', 'hey!'])
return 'tests pass'
print test()
In [ ]:
def search(pattern, text):
"Match pattern anywhere in text; return longest earliest match or None."
for i in range(len(text)):
m = match(pattern, text[i:])
if m is not None:
return m
def match(pattern, text):
"Match pattern against start of text; return longest match found or None."
remainders = matchset(pattern, text)
if remainders:
shortest = min(remainders, key=len)
return text[:len(text)-len(shortest)]
def test():
assert match(('star', ('lit', 'a')),'aaabcd') == 'aaa'
assert match(('alt', ('lit', 'b'), ('lit', 'c')), 'ab') == None
assert match(('alt', ('lit', 'b'), ('lit', 'a')), 'ab') == 'a'
assert match(seq(star(lit('a')), star(lit('b'))), 'abbv') == 'abb'
assert search(('alt', ('lit', 'b'), ('lit', 'c')), 'ab') == 'b'
return 'tests pass'
print test()
In [13]:
def n_ary(f):
"""Given binary function f(x, y), return an n_ary function such
that f(x, y, z) = f(x, f(y,z)), etc. Also allow f(x) = x."""
def n_ary_f(x, *args):
return x if not args else f(x, n_ary_f(*args))
return n_ary_f
def test():
f = lambda x, y: ('seq', x, y)
g = n_ary(f)
assert g(2,3,4) == ('seq', 2, ('seq', 3, 4))
assert g(2) == 2
assert g(2,3) == ('seq', 2, 3)
return "tests pass"
print test()
In [1]:
from functools import update_wrapper
def n_ary(f):
"""Given binary function f(x, y), return an n_ary function such
that f(x, y, z) = f(x, f(y,z)), etc. Also allow f(x) = x."""
def n_ary_f(x, *args):
return x if not args else f(x, n_ary_f(*args))
update_wrapper(n_ary_f, f) # update the function name and doc
return n_ary_f
@n_ary
def seq(x, y): return ('seq', x, y)
help(seq)
In [27]:
def decorator(d):
def _d(fn):
return update_wrapper(d(fn), fn)
return update_wrapper(_d, d)
@decorator
def n_ary(f):
"""Given binary function f(x, y), return an n_ary function such
that f(x, y, z) = f(x, f(y,z)), etc. Also allow f(x) = x."""
def n_ary_f(x, *args):
return x if not args else f(x, n_ary_f(*args))
return n_ary_f
@n_ary
def seq(x, y): return ('seq', x, y)
help(seq)
help(n_ary)
print seq(2,3,4)
In [ ]:
from functools import update_wrapper
from string import split
import re
def grammar(description, whitespace=r'\s*'):
"""Convert a description to a grammar. Each line is a rule for a
non-terminal symbol; it looks like this:
Symbol => A1 A2 ... | B1 B2 ... | C1 C2 ...
where the right-hand side is one or more alternatives, separated by
the '|' sign. Each alternative is a sequence of atoms, separated by
spaces. An atom is either a symbol on some left-hand side, or it is
a regular expression that will be passed to re.match to match a token.
Notation for *, +, or ? not allowed in a rule alternative (but ok
within a token). Use '\' to continue long lines. You must include spaces
or tabs around '=>' and '|'. That's within the grammar description itself.
The grammar that gets defined allows whitespace between tokens by default;
specify '' as the second argument to grammar() to disallow this (or supply
any regular expression to describe allowable whitespace between tokens)."""
G = {' ': whitespace}
description = description.replace('\t', ' ') # no tabs!
for line in split(description, '\n'):
lhs, rhs = split(line, ' => ', 1)
alternatives = split(rhs, ' | ')
G[lhs] = tuple(map(split, alternatives))
return G
def decorator(d):
"Make function d a decorator: d wraps a function fn."
def _d(fn):
return update_wrapper(d(fn), fn)
update_wrapper(_d, d)
return _d
@decorator
def memo(f):
"""Decorator that caches the return value for each call to f(args).
Then when called again with same args, we can just look it up."""
cache = {}
def _f(*args):
try:
return cache[args]
except KeyError:
cache[args] = result = f(*args)
return result
except TypeError:
# some element of args can't be a dict key
return f(args)
return _f
def parse(start_symbol, text, grammar):
"""Example call: parse('Exp', '3*x + b', G).
Returns a (tree, remainder) pair. If remainder is '', it parsed the whole
string. Failure iff remainder is None. This is a deterministic PEG parser,
so rule order (left-to-right) matters. Do 'E => T op E | T', putting the
longest parse first; don't do 'E => T | T op E'
Also, no left recursion allowed: don't do 'E => E op T'"""
tokenizer = grammar[' '] + '(%s)'
def parse_sequence(sequence, text):
result = []
for atom in sequence:
tree, text = parse_atom(atom, text)
if text is None: return Fail
result.append(tree)
return result, text
@memo
def parse_atom(atom, text):
if atom in grammar: # Non-Terminal: tuple of alternatives
for alternative in grammar[atom]:
tree, rem = parse_sequence(alternative, text)
if rem is not None: return [atom]+tree, rem
return Fail
else: # Terminal: match characters against start of text
m = re.match(tokenizer % atom, text)
return Fail if (not m) else (m.group(1), text[m.end():])
# Body of parse:
return parse_atom(start_symbol, text)
Fail = (None, None)
JSON = grammar("""value => number | array | string | object | true | false | null
object => { } | { members }
members => pair , members | pair
pair => string : value
array => [[] []] | [[] elements []]
elements => value , elements | value
string => "[^"]*"
number => int frac exp | int frac | int exp | int
int => [-+]?[1-9][0-9]*
frac => [.][0-9]+
exp => [eE][-+][0-9]+""", whitespace='\s*')
def json_parse(text):
return parse('value', text, JSON)
def test():
assert json_parse('["testing", 1, 2, 3]') == (
['value', ['array', '[', ['elements', ['value',
['string', '"testing"']], ',', ['elements', ['value', ['number',
['int', '1']]], ',', ['elements', ['value', ['number',
['int', '2']]], ',', ['elements', ['value', ['number',
['int', '3']]]]]]], ']']], '')
assert json_parse('-123.456e+789') == (
['value', ['number', ['int', '-123'], ['frac', '.456'], ['exp', 'e+789']]], '')
assert json_parse('{"age": 21, "state":"CO","occupation":"rides the rodeo"}') == (
['value', ['object', '{', ['members', ['pair', ['string', '"age"'],
':', ['value', ['number', ['int', '21']]]], ',', ['members',
['pair', ['string', '"state"'], ':', ['value', ['string', '"CO"']]],
',', ['members', ['pair', ['string', '"occupation"'], ':',
['value', ['string', '"rides the rodeo"']]]]]], '}']], '')
return 'tests pass'
print test()
In [14]:
import re
def findtags(text):
parms = '(?:\w+\s*=\s*"[^"]*"\s*)*'
tags = '(<\s*\w+\s*' + parms + '\s*/?>)'
return re.findall(tags, text)
testtext1 = """
My favorite website in the world is probably
<a href="www.udacity.com">Udacity</a>. If you want
that link to open in a <b>new tab</b> by default, you should
write <a href="www.udacity.com"target="_blank">Udacity</a>
instead!
"""
testtext2 = """
Okay, so you passed the first test case. <let's see> how you
handle this one. Did you know that 2 < 3 should return True?
So should 3 > 2. But 2 > 3 is always False.
"""
testtext3 = """
It's not common, but we can put a LOT of whitespace into
our HTML tags. For example, we can make something bold by
doing < b > this < /b >, Though I
don't know why you would ever want to.
"""
def test():
assert findtags(testtext1) == ['<a href="www.udacity.com">',
'<b>',
'<a href="www.udacity.com"target="_blank">']
assert findtags(testtext2) == []
assert findtags(testtext3) == ['< b >']
return 'tests pass'
print test()
In [ ]: